#This does a minimal cleaning of the passenger car data (vehicle registrations gathered monthly)





#GET VEHICLE REGISTRATION NAMES AND PATHS#
filenames_reg<-list.files(pattern="Gekentekende_voertuigen.*?.RData" )
filedate_reg<-str_match(filenames_reg, "Gekentekende_voertuigen_(.*?).RData" )[,2]
filedate_reg<-gsub("[Jj]anuari", "january",filedate_reg )
filedate_reg<-gsub("[Ff]ebruari", "february",filedate_reg )
filedate_reg<-gsub("[Mm]aart", "march",filedate_reg )
filedate_reg<-gsub("[Mm]ei", "may",filedate_reg )
filedate_reg<-gsub("[Jj]uni", "june",filedate_reg )
filedate_reg<-gsub("[Jj]uli", "july",filedate_reg )
filedate_reg<-gsub("[Aa]ugustus", "august",filedate_reg )
filedate_reg<-gsub("[Oo]ktober", "october",filedate_reg )
Files_reg<-cbind.data.frame(filenames_reg,as.Date(dmy(filedate_reg)),stringsAsFactors=FALSE)
colnames(Files_reg)<-c("path", "date")
Files_reg<- Files_reg[order(Files_reg$date),]
Files_reg$order<-order(Files_reg$date)
rm(filedate_reg, filenames_reg)




#LOAD STARTING FILE (first)# 
load(Files_reg$path[1])


#Keep only passenger cars
x<-subset(x, Voertuigsoort=="Personenauto")
#Keep only certain body times (the largest groups)
x<-subset(x, Inrichting %in% c("cabriolet", "coupe", "hatchback", "MPV", "Niet geregistreerd", "sedan",
                               "stationwagen"))


#Drop cars with number of wheels different than 4
x<-subset(x, Aantal.wielen==4)




#Drop dots from variable names
names(x)<-gsub("\\.", "",names(x))


#All lower case
names(x)<-tolower(names(x))

#Transform factor in character
i <- sapply(x, is.factor)
x[i] <- lapply(x[i], as.character)



#Get subset of information
x<-subset(x, select=c(kenteken, typegoedkeuringsnummer, variant, uitvoering, vervaldatumapk, datumtenaamstelling, datumeersteafgiftenederland, datumeerstetoelating, exportindicator, openstaandeterugroepactieindicator))
#Free memory space
gc()

#If dates not with /, add them
i<- grepl("/", x$vervaldatumapk)==FALSE & is.na(x$vervaldatumapk)==FALSE & x$vervaldatumapk!="" 
x$vervaldatumapk[i]<-gsub("^([[:digit:]]{4})([[:digit:]]{2})([[:digit:]]{2})$", "\\1/\\2/\\3" ,x$vervaldatumapk[i])

i<- grepl("/", x$datumtenaamstelling)==FALSE & is.na(x$datumtenaamstelling)==FALSE & x$datumtenaamstelling!=""
x$datumtenaamstelling[i]<-gsub("^([[:digit:]]{4})([[:digit:]]{2})([[:digit:]]{2})$", "\\3/\\2/\\1" ,x$datumtenaamstelling[i])

i<- grepl("/", x$datumeersteafgiftenederland)==FALSE & is.na(x$datumeersteafgiftenederland)==FALSE & x$datumeersteafgiftenederland!=""
x$datumeersteafgiftenederland[i]<-gsub("^([[:digit:]]{4})([[:digit:]]{2})([[:digit:]]{2})$", "\\3/\\2/\\1" ,x$datumeersteafgiftenederland[i])

i<- grepl("/", x$datumeerstetoelating)==FALSE & is.na(x$datumeerstetoelating)==FALSE & x$datumeerstetoelating!=""
x$datumeerstetoelating[i]<-gsub("^([[:digit:]]{4})([[:digit:]]{2})([[:digit:]]{2})$", "\\3/\\2/\\1" ,x$datumeerstetoelating[i])

rm(i)
gc()

#This becomes the main file
Main<-x
rm(x)
gc()

Main$vervaldatumapk_all<-Main$vervaldatumapk
Main$datumtenaamstelling_all<-Main$datumtenaamstelling
Main$exported_last<-NA
Main$scrappage_date<-NA
Main$recall_no<-NA
Main$recall_yes<-NA
i<-Main$openstaandeterugroepactieindicator=="Nee" & is.na(Main$openstaandeterugroepactieindicator)==FALSE & Main$openstaandeterugroepactieindicator!=""
Main$recall_no[i]<-as.character(format(Files_reg$date[1], "%d/%m/%Y"))
rm(i)
i<-Main$openstaandeterugroepactieindicator=="Ja" & is.na(Main$openstaandeterugroepactieindicator)==FALSE & Main$openstaandeterugroepactieindicator!=""
Main$recall_yes[i]<-as.character(format(Files_reg$date[1], "%d/%m/%Y"))
rm(i)
Main$recall_new<-NA
Main$recall_fixed<-NA



print(paste("Iteration ", 1, " ", Files_reg$date[1], " ended", sep=""))
print(object.size(Main), units="Mb")
gc() 

max<-nrow(Files_reg)

#Save temporary file
save(Main, file="CleanRegTemp.RData")
rm(Main)
gc()


#GET THE SECONDARY FILE TO APPEND
for(j in 2:max) {
  date<-Files_reg$date[j]
  load(Files_reg$path[j])
  
  #Keep only passenger cars
  x<-subset(x, Voertuigsoort=="Personenauto")
  #Keep only certain body times (the largest groups)
  x<-subset(x, Inrichting %in% c("cabriolet", "coupe", "hatchback", "MPV", "Niet geregistreerd", "sedan",
                                 "stationwagen"))

  #Drop cars with number of wheels different than 4
  x<-subset(x, Aantal.wielen==4)
  
  #Drop dots from variable names
  names(x)<-gsub("\\.", "",names(x))
  

  
  #All lower case
  names(x)<-tolower(names(x))
  
  #Transform factor in character
  i <- sapply(x, is.factor)
  x[i] <- lapply(x[i], as.character)
  
  #Select only certain variables
  x<-subset(x, select=c(kenteken, typegoedkeuringsnummer, variant, uitvoering, vervaldatumapk, datumtenaamstelling, datumeersteafgiftenederland, datumeerstetoelating, exportindicator, openstaandeterugroepactieindicator))
  #Free up memory space
  gc()
  
  #If dates not with /, add them
  i<- grepl("/", x$vervaldatumapk)==FALSE & is.na(x$vervaldatumapk)==FALSE & x$vervaldatumapk!="" 
  x$vervaldatumapk[i]<-gsub("^([[:digit:]]{4})([[:digit:]]{2})([[:digit:]]{2})$", "\\1/\\2/\\3" ,x$vervaldatumapk[i])
  
  i<- grepl("/", x$datumtenaamstelling)==FALSE & is.na(x$datumtenaamstelling)==FALSE & x$datumtenaamstelling!=""
  x$datumtenaamstelling[i]<-gsub("^([[:digit:]]{4})([[:digit:]]{2})([[:digit:]]{2})$", "\\3/\\2/\\1" ,x$datumtenaamstelling[i])
  
  i<- grepl("/", x$datumeersteafgiftenederland)==FALSE & is.na(x$datumeersteafgiftenederland)==FALSE & x$datumeersteafgiftenederland!=""
  x$datumeersteafgiftenederland[i]<-gsub("^([[:digit:]]{4})([[:digit:]]{2})([[:digit:]]{2})$", "\\3/\\2/\\1" ,x$datumeersteafgiftenederland[i])
  
  i<- grepl("/", x$datumeerstetoelating)==FALSE & is.na(x$datumeerstetoelating)==FALSE & x$datumeerstetoelating!=""
  x$datumeerstetoelating[i]<-gsub("^([[:digit:]]{4})([[:digit:]]{2})([[:digit:]]{2})$", "\\3/\\2/\\1" ,x$datumeerstetoelating[i])
  
  rm(i)
  gc()
  
  
  #Now load previous Main file
  load("CleanRegTemp.RData")
  
  Main<-merge(Main, x, by=c("kenteken"), all.x=TRUE, all.y=TRUE)
  #Drop old file and free up memory
  rm(x)
  gc()
  
  #Create variables for type approval, variant, version
  Main$typegoedkeuringsnummer[is.na(Main$typegoedkeuringsnummer.y)==FALSE]<-Main$typegoedkeuringsnummer.y[is.na(Main$typegoedkeuringsnummer.y)==FALSE]
  Main$typegoedkeuringsnummer.x<-NULL
  Main$typegoedkeuringsnummer.y<-NULL
  
  Main$variant[is.na(Main$variant.y)==FALSE]<-Main$variant.y[is.na(Main$variant.y)==FALSE]
  Main$variant.x<-NULL
  Main$variant.y<-NULL
  
  Main$uitvoering[is.na(Main$uitvoering.y)==FALSE]<-Main$uitvoering.y[is.na(Main$uitvoering.y)==FALSE]
  Main$uitvoering.x<-NULL
  Main$uitvoering.y<-NULL
  gc()
  
  #COLLECT ROUTINE VEHICLE CHECK DATES
  #Check new date for check (not empty, not for cars out of the market)
  #TWO VARIABLES:
  #vervaldatumapk_all, that contains ALL check dates known so far, separated by semicolumns
  #vervaldatumapk, that contains only the most recent check date
  
  #Replace NA with blank
  Main$vervaldatumapk_all[is.na(Main$vervaldatumapk_all)==TRUE]<-""
  
  #ADD APK CHECK DATE
  #Conditions:
  #1) previous date different than current date AND current date not blank
  #OR
  #2) previous date blank AND current date not blank
  i<-(Main$vervaldatumapk.x!=Main$vervaldatumapk.y & is.na(Main$vervaldatumapk.y)==FALSE & Main$vervaldatumapk.y!="")| (is.na(Main$vervaldatumapk.x)==TRUE &
                                                                                                                          is.na(Main$vervaldatumapk.y)==FALSE & Main$vervaldatumapk.y!="") 
  #If conditions met, add current date to vervaldatumapk_all
  Main$vervaldatumapk_all[i]<-paste(Main$vervaldatumapk_all[i], Main$vervaldatumapk.y[i], sep=";")
  #Substitute most recent apk check date
  Main$vervaldatumapk<-Main$vervaldatumapk.y
  Main$vervaldatumapk.x<-Main$vervaldatumapk.y<-NULL
  rm(i)
  gc()
  
  #COLLECT CAR RESALES DATES
  #TWO VARIABLES:
  #datumtenaamstelling_all, that contains ALL resell dates known so far, separated by semicolumns
  #datumtenaamstelling, that contains only the most recent resell
  #Collect all car resales, separated by semicolumns

  
  #Conditions:
  #1) Previous date different than current date AND current date not blank
  i<-Main$datumtenaamstelling.x!=Main$datumtenaamstelling.y  & is.na(Main$datumtenaamstelling.y)==FALSE
  i[is.na(i)==TRUE]<-TRUE
  #If conditions met, append to main file
  Main$datumtenaamstelling_all[i]<-paste(Main$datumtenaamstelling_all[i], Main$datumtenaamstelling.y[i], sep=";")
  Main$datumtenaamstelling<-Main$datumtenaamstelling.y
  Main$datumtenaamstelling.y<-Main$datumtenaamstelling.x<-NULL
  rm(i)
  gc()
  
  
  #COLLECT DATE OF LAST EXPORT
  #exported_last tells the approximate month of export
  #Create variable of date of last export (based on download month)
  #Conditions:
  #Previous export indicator either no or blank AND current export indicator yes
  i<-(Main$exportindicator.x=="Nee"|is.na(Main$exportindicator.x)) & Main$exportindicator.y=="Ja"
  Main$exported_last[i]<-format(as.Date(Files_reg$date[(j-1)],origin="1970-01-01"))  
  rm(i)
  gc()
  
  #GET SCRAPPAGE DATE
  #scrappage_date is the date (based on month of download) in which the car appeared for the last time
  #Create variable of scrappage (real, not exported cars that "disappear" from dataset)
  #Based on first date of sale (can be any other variable stable over time)
  #Conditions:
  #Previous first date of sale not blank AND current first date of sale blank AND previous export indicator is no
  i<-is.na(Main$datumeerstetoelating.x)==FALSE & is.na(Main$datumeerstetoelating.y)==TRUE & Main$exportindicator.x=="Nee"
  Main$scrappage_date[i]<-format(as.Date(Files_reg$date[(j-1)],origin="1970-01-01"))  
  rm(i)
  gc()
  
  #GET EXPORT DATE WHEN CAR IS SCRAPPED AND INDICATOR OF EXPORT IS YES
  #Exported cars disappear from the dataset after 2 years
  #For the early cases, we cannot observe when the export took place, but we can infer.
  #If car scrapped but exported, then get approximate date of exported_last (-2 years)
  #Condition: first date of sale not null AND current first date of sale not null AND previous export indicator is yes
  i<-is.na(Main$datumeerstetoelating.x)==FALSE & is.na(Main$datumeerstetoelating.y)==TRUE & Main$exportindicator.x=="Ja"
  #If conditions met, get export date going back by 2 years
  tempdate<-as.character(Files_reg$date[(j-1)])
  substring(tempdate, 1,4)<-as.character(as.numeric(substr(tempdate, 1,4))-2)
  tempdate<-format(as.Date(ymd(tempdate),origin="1970-01-01"))   
  Main$exported_last[i]<-tempdate
  
  Main$exportindicator<-Main$exportindicator.y
  Main$exportindicator.y<-Main$exportindicator.x<-NULL
  rm(i, tempdate)
  gc()
  
  #UPDATE FIRST REGISTRATION DATE
  #In really few cases dates of first registration are updated, so we always update the first registration date with the last available
  i<-is.na(Main$datumeersteafgiftenederland.y)==FALSE
  Main$datumeersteafgiftenederland<-Main$datumeersteafgiftenederland.x
  Main$datumeersteafgiftenederland[i]<-Main$datumeersteafgiftenederland.y[i]
  Main$datumeersteafgiftenederland.x<-Main$datumeersteafgiftenederland.y<-NULL
  
  Main$datumeerstetoelating<-Main$datumeerstetoelating.x
  Main$datumeerstetoelating[i]<-Main$datumeerstetoelating.y[i]
  Main$datumeerstetoelating.x<-Main$datumeerstetoelating.y<-NULL
  rm(i)
  
  
  
  
  #NO RECALL
  i<-Main$openstaandeterugroepactieindicator.y=="Nee" & is.na(Main$openstaandeterugroepactieindicator.y)==FALSE & Main$openstaandeterugroepactieindicator.y!=""
  Main$recall_no[i]<-paste(Main$recall_no[i], as.character(format(date, "%d/%m/%Y")), sep=";")
  rm(i)
  gc()
  
  #YES RECALL
  i<-Main$openstaandeterugroepactieindicator.y=="Ja" & is.na(Main$openstaandeterugroepactieindicator.y)==FALSE & Main$openstaandeterugroepactieindicator.y!=""
  Main$recall_yes[i]<-paste(Main$recall_yes[i], as.character(format(date, "%d/%m/%Y")), sep=";")
  rm(i)
  gc()
  
  #NEW RECALL
  i<-Main$openstaandeterugroepactieindicator.y=="Ja" & Main$openstaandeterugroepactieindicator.x=="Nee" &
    is.na(Main$openstaandeterugroepactieindicator.x)==FALSE & Main$openstaandeterugroepactieindicator.x!="" &
    is.na(Main$openstaandeterugroepactieindicator.y)==FALSE & Main$openstaandeterugroepactieindicator.y!=""
  table(i)
  Main$recall_new[i]<-paste(Main$recall_new[i], as.character(format(date, "%d/%m/%Y")), sep=";")
  rm(i)
  gc()
  
  #RECALL FIXED
  i<-Main$openstaandeterugroepactieindicator.y=="Nee" & Main$openstaandeterugroepactieindicator.x=="Ja" &
    is.na(Main$openstaandeterugroepactieindicator.x)==FALSE & Main$openstaandeterugroepactieindicator.x!="" &
    is.na(Main$openstaandeterugroepactieindicator.y)==FALSE & Main$openstaandeterugroepactieindicator.y!=""
  table(i)
  Main$recall_fixed[i]<-paste(Main$recall_fixed[i], as.character(format(date, "%d/%m/%Y")), sep=";")
  rm(i)
  gc()
  
  
  
  
  Main$openstaandeterugroepactieindicator<-Main$openstaandeterugroepactieindicator.y
  Main$openstaandeterugroepactieindicator.y<-Main$openstaandeterugroepactieindicator.x<-NULL
  
  
  
  save(Main, file="CleanRegTemp.RData")
  print(paste("Iteration ", j, " ", date, " ended", sep=""))
  print(object.size(Main), units="Mb")
  rm(date, Main)
  gc() 
} #End clean secondary file


load("CleanRegTemp.RData")
#Remove dates of check and resell (they are already saved)
Main$datumtenaamstelling<-NULL
Main$vervaldatumapk<-NULL

#Save
save(Main, file="Registration_merged.RData")
